Prep

#' DEAprep
#'
#' Generates control samples out of the average values over all supplied samples, combines spliced and unspliced
#' assays and calculates their logCPM & log2FC data. 
#' All is assembled into one SE object. 
#' 
#'
#' @param se SummarizedExperiment object containing assays of raw counts of spliced & unspliced tx
#'
#' @return an SE object of combined spliced and unspliced data as well as control samples generated from averaging over all treatment samples
#' 
DEAprep <- function(se){
  
  # allocation
  
  e1 <- assays(se)$spliced
  e2 <- assays(se)$unspliced
  readtype1 <- "spliced"
  readtype2 <- "unspliced"
  
  
  # generate control
  
  ## create logcpm assays for both spliced & unspliced assays
  assays(se)$logcpm.spliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$spliced))))
  assays(se)$logcpm.unspliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$unspliced))))

  ## generate a 'control' sample out of the median normalized counts over all samples
  e.ctrl1 <- genCTRL(se, readtype1, "logcpm.spliced")
  e.ctrl2 <- genCTRL(se, readtype2, "logcpm.unspliced")
  
  ## add control samples to assays & generate DGEList object
  dds1 <- DGEList(cbind(e1, e.ctrl1))
  dds2 <- DGEList(cbind(e2, e.ctrl2))
  
  ## combine the spliced & unspliced assays
  dds <- cbind(dds1, dds2)

  
  # generate colData
  
  dd1 <- rbind(colData(se)[,c("Batch","miRNA","Cell_Line")], 
               data.frame(Batch=unique(se$Batch), miRNA="CTRL", Cell_Line=unique(se$Cell_Line)))
  dd1$readtype <- readtype1
  
  ## duplicate dd to have data for combined spliced & unspliced assay
  dd <- rbind(dd1, dd1)
  dd$readtype[(nrow(dd1)+1):nrow(dd)] <- readtype2
  dd$readtype <- as.factor(dd$readtype)
  
  ## rename both dds & dd object features
  n.cols1 <- sapply(colnames(dds1), FUN=function(x)
      var_name <- paste(x, readtype1, sep=".") )
  n.cols2 <- sapply(colnames(dds2), FUN=function(x)
      var_name <- paste(x, readtype2, sep=".") )
  
  colnames(dds) <- c(n.cols1, n.cols2)
  rownames(dd) <- c(n.cols1, n.cols2)
  
  
  # rebuild SE object
  
  ## SE object with logCPM & logFC assays
  se <- SummarizedExperiment(assays=list(counts=dds$counts), 
                             rowData=rowData(se), 
                             colData=dd) 
  assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
  se <- SEtools::log2FC(se, controls = se$miRNA=="CTRL", by = paste(se$Batch, se$readtype), fromAssay = "logcpm")
  
  return(se)
}

PCA

HeLa

HEK

DEA

#' exonDEA
#'
#' @param se SE object containing assays with combined spliced & unspliced counts
#' @param model function: model matrix function to test
#' @param model0 function: model matrix function to be tested against
#' @param control string: name of control samples inside colData() of supplied SE object
#'
#' @return SE object containing DEA results over all treatments & over individual ones
#' 
exonDEA <- function(se, model, model0=~1, control){
  
  ## allocation
  se$miRNA <- relevel(droplevels(se$miRNA), ref=control)
  se$readtype <- relevel(se$readtype, ref="unspliced")
  se$Batch <- droplevels(se$Batch)
  dd <- colData(se)
  ## normalization
  dds <- calcNormFactors(DGEList(assays(se)$counts))
  ## models
  mm <- model.matrix(model, data=dd)
  mm0 <- model.matrix(model0, data=dd)
  testCoeff <- setdiff(colnames(mm), colnames(mm0))
  ## estimate dispersion
  dds <- estimateDisp(dds,mm)
  ## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
  fit <- glmFit(dds, mm)
  ## fit a GLM on the data
  lrt.comb <- glmLRT(fit, testCoeff)
  ## top genes that change relative to stage 0
  res.comb <- as.data.frame(topTags(lrt.comb, Inf))
  ## fit linear model dropping one sample at a time (using multiple cores)
  res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
    as.data.frame(topTags(glmLRT(fit, x),Inf))
  })
  dea.names <- gsub("readtype","", testCoeff)
  dea.names <- make.names(gsub(":miRNA",".", dea.names))
  colnames(res.comb)[grepl("logFC",colnames(res.comb))] <- paste0("logFC.", dea.names)
  names(res.list) <- paste0("DEA.",dea.names)
  
  ## add DEAs
  rowData(se)$DEA.spliced.all <- DataFrame(res.comb[rownames(se),])
  for(i in paste0("DEA.",dea.names)){
    rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
  }

  return(se)
}

Plots

HeLa

HEK

Stats

## [1] 0.2207416
## [1] 0.2002699
## $`1e-10`
## 
##   -1    0    1 
## 3209  910 2979 
## 
## $`1e-05`
## 
##    -1     0     1 
## 11736  3360 11112 
## 
## $`0.05`
## 
##    -1     0     1 
## 44144 12912 42472 
## 
## $`0.1`
## 
##    -1     0     1 
## 51611 15156 49687 
## 
## $`1`
## 
##     -1      0      1 
## 129185  43457 126722
## $`1e-10`
## 
##   -1    0    1 
## 3311  828 3313 
## 
## $`1e-05`
## 
##    -1     0     1 
## 10530  2628 10494 
## 
## $`0.05`
## 
##    -1     0     1 
## 42032 10560 42124 
## 
## $`0.1`
## 
##    -1     0     1 
## 50116 12664 50242 
## 
## $`1`
## 
##     -1      0      1 
## 162550  46560 165326
# compare to non-exon-specific DEA results

## construct non-exon-specific SE object
### load
se.comb.hela <- readRDS("data/bartel.hela.DEA.SE.rds")
rowData(se.comb.hela) <- lapply(rowData(se.comb.hela), FUN=function(x){ if(is.data.frame(x)) x <- DataFrame(x); x} )
### only select genes common with our exon-specific dataset
common <- intersect(rowData(se.hela.comb)$gene_name, rowData(se.comb.hela)$symbol)
se.comb.hela <- se.comb.hela[common,]

### generate control samples
e.ctrl <- genCTRL(se.comb.hela, "counts", "logcpm")

### add control samples to assays & generate DGEList object
dds <- DGEList(cbind(assay(se.comb.hela), e.ctrl))

### generate colData
dd <- colData(se.comb.hela)[,c("Batch","miRNA","Cell_Line")]
dd.ctrl <- data.frame(Batch=unique(se.comb.hela$Batch), 
                       miRNA="CTRL", 
                       Cell_Line=unique(as.character(dd$Cell_Line)) )
dd <- rbind(dd, dd.ctrl)

### generate SE & logCPM & log2FC assays
se.comb.hela <- SummarizedExperiment(assays=list(counts=dds$counts), rowData=rowData(se.comb.hela), colData=dd)
assays(se.comb.hela)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se.comb.hela)))))
se.comb.hela <- SEtools::log2FC(se.comb.hela, controls = se.comb.hela$miRNA=="CTRL", by = se.comb.hela$Batch, fromAssay = "logcpm")

rowData(se.comb.hela)$DEA.all <- DataFrame(FDR=unlist(
  bplapply(1:nrow(se.comb.hela), function(i){
    fdr <- sapply(rowData(se.comb.hela)[-1], function(x){
      x[[i,"FDR"]]
      })
    min(fdr)
    }, BPPARAM = MulticoreParam(8, progressbar = FALSE) )
  ), row.names = rownames(se.comb.hela) )



## logFC heatmap HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
SEtools::sehm(se.comb.hela[,order(colData(se.comb.hela)$miRNA)], row.names(se.comb.hela)[rowData(se.comb.hela)$DEA.all$FDR < 0.01],
              breaks=T, do.scale = F, show_colnames = FALSE, assayName = "log2FC", anno_columns = "miRNA")

## logCPM vs logFC scatterplot HeLa comb
LSD:::heatscatter(as.vector(assays(se.comb.hela)$logcpm),as.vector(assays(se.comb.hela)$log2FC))
abline(a=0, b=0)

## logFC signs at different significance levels 
signTables(se.comb.hela, sig, "DEA.all")

Investigate upregulated spliced tx

Analysis

# Check if log2FC upregulations are due to skewed controls 


ctrlAnalysis <- function(se, pert, TS, fc.degree){
  
  ## vector of Bartel miRNA treatments and their corresponding seeds (based on actual miRNA seqs)
  pert.seed <- sapply(se$miRNA[se$miRNA!="CTRL" & se$readtype=="spliced"], function(x){
    as.character(pert$seed[grepl(paste0(x,"$"), pert$treatment) | grepl(paste0(x,"\\("), pert$treatment)])
    })
  names(pert.seed) <- se$miRNA[se$miRNA!="CTRL" & se$readtype=="spliced"]
  
  ## adjusted gene names to be identical to TS names
  fc.genes <- as.character(rowData(se)$gene_name)
  ## only consider common genes
  id <- fc.genes %in% TS$feature
  ## significant ids
  id.sig <- id & rowData(se)$DEA.spliced.all$FDR < .05
  
  ## aggregate logFC by genes
  logfc <- cbind(as.data.frame(assays(se)$log2FC),fc.genes)
  logfc <- aggregate(logfc[,colnames(logfc)!="fc.genes"], by=list(logfc$fc.genes), FUN="sum")
  rownames(logfc) <- logfc$Group.1
  logfc.sign <- logfc.sign.deg <- sign(logfc[,colnames(logfc)!="Group.1"])
  
  ## update gene names, DEA and IDs: aggregate by genes
  ### aggregate DEA FDR info
  fdr <- rowData(se)$DEA.spliced.all
  rownames(fdr) <- fc.genes
  fdr <- aggregate(fdr[fc.genes[id],"FDR"], by=list(rownames(fdr[fc.genes[id],])), FUN="min")
  rownames(fdr) <- fdr$Group.1
  colnames(fdr) <- c("genes","FDR")
  ### adjusted gene names to be identical to TS names
  fc.genes <- rownames(logfc)
  ### only consider common genes
  id <- fc.genes %in% TS$feature
  ### significant ids
  id.sig <- fc.genes %in% rownames(fdr)[fdr$FDR<.05]
  
  ## for splicing: only genes common with TS & with logfc createer than 'fc.degree' & significant FDR
  logfc.sign.deg[logfc[,colnames(logfc)!="Group.1"] < fc.degree] <- 0
  logfc.sign.deg <- logfc.sign.deg[rownames(logfc.sign.deg) %in% fc.genes[id.sig] & rowSums(logfc.sign.deg)>0, se$miRNA!="CTRL"]
  ## for control: only genes common with TS
  logfc.sign <- logfc.sign[rownames(logfc.sign) %in% fc.genes[id], se$miRNA!="CTRL"]
             
  
  ## to find treatment miRNA for each gene for which the logFC are upregulated
  nonUpregSeeds <- function(readtype, logfc, pert.seed){
    l <- lapply(readtype, function(i){
      apply(logfc[,readtype==i], 1, function(x){
        pert.seed[x!=1]
        })
      })
    return(l)
  }
  
  ## splicing: find treatment miRNAs potentially targeting each gene
  n <- nonUpregSeeds(unique(se$readtype), logfc.sign.deg, pert.seed)
  names(n) <- unique(se$readtype)
  ## control: background distribution of upregulation instances for each gene
  m <- nonUpregSeeds(unique(se$readtype), logfc.sign, pert.seed)
  names(m) <- paste0("ctrl.",unique(se$readtype))
  ## combine
  n <- c(n,m)
  
  
  ## for each common gene select the miRNA families that target them (using TargetScan data)
  ts.all <- lapply(fc.genes[id], function(x){
    TS$family[TS$feature==x]
    })
  names(ts.all) <- fc.genes[id]
  ts.list <- list(splicing=ts.all[fc.genes[id.sig]], ctrl=ts.all[fc.genes[id]])
  
  ## ratio of treatment miRNA are in the TS target list
  ratios <- lapply(n, function(x){
    if( length(x)==nrow(logfc.sign) ){
      sapply( names(x), function(y) sum(x[[y]] %in% ts.list$ctrl[[y]])/length(x[[y]]) )
    } else {
      sapply( names(x), function(y) sum(x[[y]] %in% ts.list$splicing[[y]])/length(x[[y]]) )
    }
  })
  
  # output
  
  return(list(n=n,ratios=ratios))
}

plots HeLa

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##       32.67519       33.03581       18.14680       18.18031
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##             33             33             18             18

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##     0.12722149     0.12731275     0.06620080     0.06598362
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##      0.1176471      0.1176471      0.0000000      0.0000000

plots HEK

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##       23.05435       23.39493       12.31965       12.21222
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##             23             24             12             12

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##     0.12722149     0.12731275     0.06620080     0.06598362
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##      0.1176471      0.1176471      0.0000000      0.0000000